
import json
import csv
import numpy as np
from scipy.stats import wilcoxon


################################################################################

def conf_int_95(values):
    """Return the half-width of a 95% normal-approximation interval.

    Computed as ``1.96 * std(values) / sqrt(len(values))``, where the standard
    deviation is ``np.std`` with its default settings (population form,
    ``ddof=0``).

    Args:
        values: Sequence of numeric observations.

    Returns:
        The margin to add to / subtract from the mean of ``values``.
    """
    sample_count = float(len(values))
    # Standard error of the mean.
    std_error = np.std(values) / np.sqrt(sample_count)
    # 1.96 is the two-sided 95% critical value of the standard normal.
    return std_error * 1.96

################################################################################

def test_significance(in_fname):
    """Print pairwise Wilcoxon signed-rank tests on classifier accuracies.

    Loads the JSON document at ``in_fname`` and, for every unordered pair of
    the classifiers listed in ``clfs``, runs ``scipy.stats.wilcoxon`` on their
    ``"test_accuracy"`` lists, printing one line per pair with the test
    statistic and p-value, followed by a ``------`` separator line.

    Args:
        in_fname: Path to a JSON file mapping each classifier name to a dict
            that contains a ``"test_accuracy"`` list.

    Raises:
        KeyError: If a listed classifier, or its ``"test_accuracy"`` entry,
            is missing from the JSON document.
    """
    # Use a context manager so the input handle is closed deterministically.
    with open(in_fname) as in_file:
        results = json.load(in_file)

    # Classifiers to compare; each must be a top-level key of the JSON file.
    clfs = [
        "linear_svm",
        "logistic_regression",
        "gradient_boosting",
        "random_forest"
    ]

    for i, clf_i in enumerate(clfs):
        clf_i_acc = results[clf_i]["test_accuracy"]
        # Only pair with classifiers later in the list, so each unordered
        # pair is tested exactly once.
        for clf_j in clfs[i + 1:]:
            clf_j_acc = results[clf_j]["test_accuracy"]

            T, p = wilcoxon(clf_i_acc, clf_j_acc, zero_method='wilcox')

            # Single-argument parenthesised print behaves the same on
            # Python 2 and Python 3. Note that %d truncates the statistic
            # to an integer.
            print("%-30s | %-30s: %d (p = %.5f)" % (
                clf_i, clf_j, T, p))

    print("------")

################################################################################

def output_results(in_fname, out_fname):
    """Aggregate per-classifier scores from a JSON file into a CSV summary.

    For every top-level key of the JSON document except ``"settings"``, one
    CSV row is written (in sorted key order) holding the mean, the ``np.std``
    standard deviation and the ``conf_int_95`` margin of that entry's
    ``"test_roc_auc"`` and ``"test_accuracy"`` lists.

    Args:
        in_fname: Path of the JSON results file to read.
        out_fname: Path of the CSV file to write; opened in ``"w"`` mode, so
            an existing file is overwritten.

    Raises:
        KeyError: If a classifier entry lacks ``"test_roc_auc"`` or
            ``"test_accuracy"``.
    """
    # Context manager ensures the input handle is closed after parsing.
    with open(in_fname) as in_file:
        results = json.load(in_file)

    # Column order of the CSV; also the keys of every aggregated row.
    output_fields = [
        "clf_name",
        "auc_mean",
        "auc_std",
        "auc_ci",
        "acc_mean",
        "acc_std",
        "acc_ci"
    ]

    # compute aggregated results
    agg_results = []
    for clf_name in sorted(results):
        # "settings" is skipped: it is not treated as a classifier entry.
        if clf_name == "settings":
            continue
        auc_scores = results[clf_name]['test_roc_auc']
        acc_scores = results[clf_name]['test_accuracy']
        agg_results.append(
            {
                "clf_name": clf_name,
                "auc_mean": np.mean(auc_scores),
                "auc_std": np.std(auc_scores),
                "auc_ci": conf_int_95(auc_scores),
                "acc_mean": np.mean(acc_scores),
                "acc_std": np.std(acc_scores),
                "acc_ci": conf_int_95(acc_scores)
            }
        )

    # output csv; the with-block guarantees the rows are flushed and the
    # handle is closed before "Done!" is reported.
    with open(out_fname, "w") as out_file:
        csv_writer = csv.DictWriter(out_file, output_fields)
        csv_writer.writeheader()
        csv_writer.writerows(agg_results)

    # Single-argument parenthesised print works on Python 2 and Python 3.
    print("Done!")

################################################################################

def main():
    """Run the aggregation on the hard-coded input and output paths.

    Both paths are relative, so they are resolved against the current
    working directory of the process.
    """
    output_results(
        "../../data/pred_res_nested/pred_wide_not_scaled.json",
        "../../data/pred_res_nested/agg_wide_not_scaled.csv",
    )
    # Pairwise significance testing (test_significance on the same JSON
    # input) is currently disabled.

################################################################################

# Run main() only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

# END
